|
1378 | 1378 | _WorkspaceSizePerElemBias: 0
|
1379 | 1379 | _WorkspaceSizePerElemC: 4
|
1380 | 1380 | _staggerStrideShift: 3
|
| 1381 | + - 1LDSBuffer: 1 |
| 1382 | + ActivationAlt: false |
| 1383 | + ActivationFuncCall: false |
| 1384 | + ActivationFused: true |
| 1385 | + AssertAIGreaterThanEqual: -1 |
| 1386 | + AssertAILessThanEqual: -1 |
| 1387 | + AssertFree0ElementMultiple: 1 |
| 1388 | + AssertFree1ElementMultiple: 1 |
| 1389 | + AssertSummationElementMultiple: 1 |
| 1390 | + AssignedDerivedParameters: true |
| 1391 | + AssignedProblemIndependentDerivedParameters: true |
| 1392 | + BaseName: Cijk_Ailk_Bjlk_B8NB8NS_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x256x64_I5iHQbor2RtVIIizztpt5kW77h2OWmEhyzabuC8fjyQ= |
| 1393 | + BufferLoad: true |
| 1394 | + BufferStore: true |
| 1395 | + CUCount: null |
| 1396 | + CUOccupancy: -1 |
| 1397 | + ClusterLocalRead: 0 |
| 1398 | + CodeObjectVersion: '4' |
| 1399 | + ConvertAfterDS: 0 |
| 1400 | + CustomKernelName: '' |
| 1401 | + DebugStreamK: 0 |
| 1402 | + DepthU: 64 |
| 1403 | + DirectToLds: false |
| 1404 | + DirectToLdsA: false |
| 1405 | + DirectToLdsB: false |
| 1406 | + DirectToVgprA: 0 |
| 1407 | + DirectToVgprB: 0 |
| 1408 | + DirectToVgprSparseMetadata: false |
| 1409 | + EdgeType: ShiftPtr |
| 1410 | + EnableF32XEmulationLds: false |
| 1411 | + EnableF32XdlMathOp: false |
| 1412 | + EnableMatrixInstruction: true |
| 1413 | + ExpandPointerSwap: 0 |
| 1414 | + ExpertSchedulingMode: 0 |
| 1415 | + ForceDisableShadowInit: false |
| 1416 | + GlobalReadPerMfma: 0.1 |
| 1417 | + GlobalReadVectorWidthA: 16 |
| 1418 | + GlobalReadVectorWidthB: 16 |
| 1419 | + GlobalSplitU: 1 |
| 1420 | + GlobalSplitUAlgorithm: MultipleBuffer |
| 1421 | + GlobalSplitUCoalesced: false |
| 1422 | + GlobalSplitUWorkGroupMappingRoundRobin: false |
| 1423 | + GlobalWriteVectorWidth: 8 |
| 1424 | + GroupLoadStore: false |
| 1425 | + GuaranteeNoPartialA: false |
| 1426 | + GuaranteeNoPartialB: false |
| 1427 | + GuaranteeNoPartialMetadata: true |
| 1428 | + ISA: [9, 4, 2] |
| 1429 | + InnerUnroll: 1 |
| 1430 | + InterleaveAlpha: 0 |
| 1431 | + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, |
| 1432 | + SupportUserGSU: true, UseUniversalArgs: true} |
| 1433 | + Kernel: true |
| 1434 | + KernelLanguage: Assembly |
| 1435 | + KernelNameMin: Cijk_Ailk_Bjlk_B8NB8NS_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p10_GRVWA16_GRVWB16_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA0_LPB0_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS2_NLCA1_NLCB1_ONLL2_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB8_WSGRA0_WSGRB2_WS64_WG32_8_1 |
| 1436 | + LDSTrInst: false |
| 1437 | + LSCA: 256 |
| 1438 | + LSCB: 256 |
| 1439 | + LSPA: 16 |
| 1440 | + LSPB: 4 |
| 1441 | + LVCA: 16 |
| 1442 | + LVCB: 16 |
| 1443 | + LVPA: 1 |
| 1444 | + LVPB: 1 |
| 1445 | + LdsBlockSizePerPadA: 2048 |
| 1446 | + LdsBlockSizePerPadB: 2048 |
| 1447 | + LdsBlockSizePerPadMetadata: 0 |
| 1448 | + LdsBytesNoAmax: 32768 |
| 1449 | + LdsInitCVgprs: false |
| 1450 | + LdsNumBytes: 32768 |
| 1451 | + LdsNumElementsAlignedA: 16384 |
| 1452 | + LdsNumElementsAlignedB: 16384 |
| 1453 | + LdsNumElementsAlignedMetadata: 0 |
| 1454 | + LdsOffsetA: 0 |
| 1455 | + LdsOffsetA_Blk: 32768 |
| 1456 | + LdsOffsetB: 16384 |
| 1457 | + LdsOffsetB_Blk: 49152 |
| 1458 | + LdsOffsetBias: 0 |
| 1459 | + LdsOffsetBiasGSU: 0 |
| 1460 | + LdsOffsetBiasNonGSU: 0 |
| 1461 | + LdsOffsetMetadata: 32768 |
| 1462 | + LdsOffsetMetadata_Blk: 49152 |
| 1463 | + LdsPadA: 0 |
| 1464 | + LdsPadB: 0 |
| 1465 | + LdsPadMetadata: 0 |
| 1466 | + LocalReadVectorWidth: 8 |
| 1467 | + LocalSplitU: 1 |
| 1468 | + LocalSplitUReuseLDS: 1 |
| 1469 | + LocalWritePerMfma: -1 |
| 1470 | + LocalWriteUseSgprA: false |
| 1471 | + LocalWriteUseSgprB: false |
| 1472 | + LoopIters: 2 |
| 1473 | + LoopUnroll: 64 |
| 1474 | + MFMA_BF16_1K: false |
| 1475 | + MIArchVgpr: false |
| 1476 | + MIBlock: [16, 16, 32, 1, 1, 1] |
| 1477 | + MIInputPerThread: 8 |
| 1478 | + MIInputPerThreadA: 8 |
| 1479 | + MIInputPerThreadB: 8 |
| 1480 | + MIInputPerThreadMetadata: 8 |
| 1481 | + MIOutputVectorWidth: 4 |
| 1482 | + MIRegPerOut: 1 |
| 1483 | + MIWaveGroup: [2, 2] |
| 1484 | + MIWaveTile: [8, 8] |
| 1485 | + MIWaveTileA: 8 |
| 1486 | + MIWaveTileB: 8 |
| 1487 | + MIWaveTileMetadata: 0 |
| 1488 | + MacroTile0: 256 |
| 1489 | + MacroTile1: 256 |
| 1490 | + MacroTileA: 256 |
| 1491 | + MacroTileB: 256 |
| 1492 | + MagicDivAlg: 2 |
| 1493 | + MathClocksUnrolledLoop: 0 |
| 1494 | + MatrixInstB: 1 |
| 1495 | + MatrixInstBM: 1 |
| 1496 | + MatrixInstBN: 1 |
| 1497 | + MatrixInstK: 32 |
| 1498 | + MatrixInstM: 16 |
| 1499 | + MatrixInstN: 16 |
| 1500 | + MatrixInstruction: [16, 16, 32, 1] |
| 1501 | + MaxLDS: 65536 |
| 1502 | + MaxOccupancy: 40 |
| 1503 | + MbskPrefetchOpt: 0 |
| 1504 | + NoLdsWriteCode: false |
| 1505 | + NoReject: false |
| 1506 | + NoTailLoop: false |
| 1507 | + NonDTLTailLoopA: false |
| 1508 | + NonDTLTailLoopB: false |
| 1509 | + NonTemporal: -1 |
| 1510 | + NonTemporalA: 0 |
| 1511 | + NonTemporalB: 0 |
| 1512 | + NonTemporalC: 0 |
| 1513 | + NonTemporalD: 4 |
| 1514 | + NonTemporalE: 0 |
| 1515 | + NonTemporalMetadata: 0 |
| 1516 | + NonTemporalWS: 0 |
| 1517 | + NumElementsPerBatchStore: 2 |
| 1518 | + NumElementsPerThread: 256 |
| 1519 | + NumGlobalWriteVectorsPerThread: 32 |
| 1520 | + NumLoadsA: 4 |
| 1521 | + NumLoadsB: 4 |
| 1522 | + NumLoadsCoalescedA: 1 |
| 1523 | + NumLoadsCoalescedB: 1 |
| 1524 | + NumLoadsPerpendicularA: 4 |
| 1525 | + NumLoadsPerpendicularB: 4 |
| 1526 | + NumThreads: 256 |
| 1527 | + NumWaveSplitK: 1 |
| 1528 | + OptNoLoadLoop: 2 |
| 1529 | + PackedC0IdxChars: [I] |
| 1530 | + PackedC0IndicesX: [0] |
| 1531 | + PackedC1IdxChars: [J] |
| 1532 | + PackedC1IndicesX: [1] |
| 1533 | + PrefetchGlobalRead: 2 |
| 1534 | + PrefetchLocalRead: 1 |
| 1535 | + PreloadKernArgs: true |
| 1536 | + ProblemType: |
| 1537 | + Activation: true |
| 1538 | + ActivationComputeDataType: 0 |
| 1539 | + ActivationNoGuard: false |
| 1540 | + ActivationType: hipblaslt_all |
| 1541 | + AllowNoFreeDims: false |
| 1542 | + AssignedDerivedParameters: true |
| 1543 | + Batched: true |
| 1544 | + BetaOnlyUseBias: false |
| 1545 | + BiasDataTypeList: [0, 4, 7] |
| 1546 | + BiasSrc: D |
| 1547 | + ComplexConjugateA: false |
| 1548 | + ComplexConjugateB: false |
| 1549 | + ComputeDataType: 0 |
| 1550 | + DataType: 12 |
| 1551 | + DataTypeA: 12 |
| 1552 | + DataTypeAmaxD: 0 |
| 1553 | + DataTypeB: 12 |
| 1554 | + DataTypeE: 12 |
| 1555 | + DestDataType: 12 |
| 1556 | + F32XdlMathOp: 0 |
| 1557 | + Gradient: false |
| 1558 | + GroupedGemm: false |
| 1559 | + HighPrecisionAccumulate: true |
| 1560 | + Index0: 0 |
| 1561 | + Index01A: 0 |
| 1562 | + Index01B: 1 |
| 1563 | + Index1: 1 |
| 1564 | + IndexAssignmentsA: [0, 3, 2] |
| 1565 | + IndexAssignmentsB: [1, 3, 2] |
| 1566 | + IndexAssignmentsLD: [4, 5, 6, 7] |
| 1567 | + IndexAssignmentsMetadata: [3, 0, 2] |
| 1568 | + IndexUnroll: 3 |
| 1569 | + IndexUnrollA: 1 |
| 1570 | + IndexUnrollB: 1 |
| 1571 | + IndexUnrollM: 0 |
| 1572 | + IndicesBatch: [2] |
| 1573 | + IndicesFree: [0, 1] |
| 1574 | + IndicesSummation: [3] |
| 1575 | + MirrorDimsA: [] |
| 1576 | + MirrorDimsB: [] |
| 1577 | + MirrorDimsMetadata: [] |
| 1578 | + NumIndicesBatch: 1 |
| 1579 | + NumIndicesC: 3 |
| 1580 | + NumIndicesFree: 2 |
| 1581 | + NumIndicesLD: 4 |
| 1582 | + NumIndicesSummation: 1 |
| 1583 | + OperationType: GEMM |
| 1584 | + OutputAmaxD: false |
| 1585 | + SetConstStrideA: [] |
| 1586 | + SetConstStrideB: [] |
| 1587 | + SetConstStrideBias: [] |
| 1588 | + SilentHighPrecisionAccumulate: false |
| 1589 | + Sparse: 0 |
| 1590 | + StochasticRounding: false |
| 1591 | + StridedBatched: true |
| 1592 | + SupportUserArgs: true |
| 1593 | + SwizzleTensorA: false |
| 1594 | + SwizzleTensorB: false |
| 1595 | + TLUA: true |
| 1596 | + TLUB: true |
| 1597 | + Tensor0: 0 |
| 1598 | + Tensor1: 1 |
| 1599 | + TileA: 0 |
| 1600 | + TileAwareSelection: false |
| 1601 | + TileB: 1 |
| 1602 | + TotalIndices: 4 |
| 1603 | + TransposeA: 0 |
| 1604 | + TransposeB: 1 |
| 1605 | + UseBeta: true |
| 1606 | + UseBias: 1 |
| 1607 | + UseE: false |
| 1608 | + UseInitialStridesAB: false |
| 1609 | + UseInitialStridesCD: false |
| 1610 | + UseScaleAB: Scalar |
| 1611 | + UseScaleAlphaVec: 1 |
| 1612 | + UseScaleCD: true |
| 1613 | + ScheduleGlobalRead: 1 |
| 1614 | + ScheduleIterAlg: 3 |
| 1615 | + ScheduleLocalWrite: 1 |
| 1616 | + SolutionIndex: 5 |
| 1617 | + SolutionNameMin: Cijk_Ailk_Bjlk_B8NB8NS_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p10_GRVWA16_GRVWB16_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA0_LPB0_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS2_NLCA1_NLCB1_ONLL2_PGR2_PLR1_PKA1_SIA3_SS1_SU64_SUM1_SUS64_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB8_WSGRA0_WSGRB2_WS64_WG32_8_1_WGM12_WGMXCC4_WGMXCCGn1 |
| 1618 | + SourceSwap: 1 |
| 1619 | + StaggerU: 64 |
| 1620 | + StaggerUMapping: 1 |
| 1621 | + StaggerUStride: 64 |
| 1622 | + StorePriorityOpt: 0 |
| 1623 | + StoreRemapVectorWidth: 0 |
| 1624 | + StoreSwapAddr: false |
| 1625 | + StoreSyncOpt: 0 |
| 1626 | + StoreVectorWidth: 8 |
| 1627 | + StreamK: 0 |
| 1628 | + StreamKAtomic: 0 |
| 1629 | + StreamKXCCMapping: 0 |
| 1630 | + SubGroup0: 8 |
| 1631 | + SubGroup1: 32 |
| 1632 | + SubGroupA: 8 |
| 1633 | + SubGroupB: 32 |
| 1634 | + SuppressNoLoadLoop: false |
| 1635 | + ThreadTile: [1, 1] |
| 1636 | + ThreadTile0: 32 |
| 1637 | + ThreadTile1: 8 |
| 1638 | + ThreadTileA: 32 |
| 1639 | + ThreadTileB: 8 |
| 1640 | + TransposeLDS: 0 |
| 1641 | + TransposeLDSMetadata: true |
| 1642 | + ULSGRODoubleG2L: 0 |
| 1643 | + UnrollLoopSwapGlobalReadOrder: 0 |
| 1644 | + UnrollMajorLDSA: 0 |
| 1645 | + UnrollMajorLDSB: 0 |
| 1646 | + UnrollMajorLDSMetadata: true |
| 1647 | + Use64bShadowLimit: 1 |
| 1648 | + UseDotInstruction: false |
| 1649 | + UseF32XEmulation: false |
| 1650 | + UseInstOffsetForGRO: 0 |
| 1651 | + UseSgprForGRO: -1 |
| 1652 | + Valid: true |
| 1653 | + VectorStore: -1 |
| 1654 | + VectorWidthA: 8 |
| 1655 | + VectorWidthB: 8 |
| 1656 | + WaveSeparateGlobalReadA: 0 |
| 1657 | + WaveSeparateGlobalReadB: 2 |
| 1658 | + WaveSeparateGlobalReadMetadata: 0 |
| 1659 | + WaveSplitK: false |
| 1660 | + WavefrontSize: 64 |
| 1661 | + WorkGroup: [32, 8, 1] |
| 1662 | + WorkGroupMapping: 12 |
| 1663 | + WorkGroupMappingXCC: 4 |
| 1664 | + WorkGroupMappingXCCGroup: -1 |
| 1665 | + WorkGroupReduction: false |
| 1666 | + WorkspaceCheck: [4, 0, 1] |
| 1667 | + _DepthU: 64 |
| 1668 | + _DepthUA: 64 |
| 1669 | + _DepthUB: 64 |
| 1670 | + _DepthUMetadata: 64 |
| 1671 | + _GlobalAccumulation: MultipleBuffer |
| 1672 | + _UseSgprForGRO: false |
| 1673 | + _VectorStore: 1 |
| 1674 | + _WorkspaceSizePerElemBias: 0 |
| 1675 | + _WorkspaceSizePerElemC: 4 |
| 1676 | + _staggerStrideShift: 0 |
| 1677 | + enableLDSTrA: false |
| 1678 | + enableLDSTrB: false |
| 1679 | + reorderGRInstForDTVA: false |
| 1680 | + reorderGRInstForDTVB: false |
| 1681 | + tailLoopOptA: true |
| 1682 | + tailLoopOptB: true |
1381 | 1683 | - [2, 3, 0, 1]
|
1382 | 1684 | - - - [128, 128, 1, 128]
|
1383 | 1685 | - [1, 0.0]
|
|
1389 | 1691 | - [3, 0.0]
|
1390 | 1692 | - - [3, 3, 1, 3]
|
1391 | 1693 | - [4, 0.0]
|
| 1694 | + - - [34048, 34048, 1, 2560] |
| 1695 | + - [5, 0.0] |
1392 | 1696 | - null
|
1393 | 1697 | - null
|
1394 | 1698 | - DeviceEfficiency
|
|
0 commit comments