Skip to content

Commit 1575885

Browse files
committed
Merge branch 'develop' of github.com:pkamd/hipBLASLt into remove_kernel
2 parents c1e4d5d + f5259ca commit 1575885

25 files changed

+41501
-181
lines changed

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -323,7 +323,7 @@ if( BUILD_CLIENTS_SAMPLES OR BUILD_CLIENTS_TESTS OR BUILD_CLIENTS_BENCHMARKS )
323323
message(STATUS "OS: ${CLIENTS_OS} ${CLIENTS_OS_VERSION}")
324324
set(GFORTRAN_RPM "libgfortran4")
325325
set(GFORTRAN_DEB "libgfortran4")
326-
if(CLIENTS_OS STREQUAL "centos" OR CLIENTS_OS STREQUAL "rhel")
326+
if(CLIENTS_OS STREQUAL "centos" OR CLIENTS_OS STREQUAL "rhel" OR CLIENTS_OS STREQUAL "almalinux")
327327
if(CLIENTS_OS_VERSION VERSION_GREATER_EQUAL "8")
328328
set(GFORTRAN_RPM "libgfortran")
329329
endif()

clients/gtest/matmul_gtest.yaml

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1923,6 +1923,44 @@ Tests:
19231923
unit_check: 1
19241924
gpu_arch: '942'
19251925

1926+
- name: matmul_large_nt_f8_bf8_dst_f8_bf8_f16_bf16_f32_gfx942
1927+
category: pre_checkin
1928+
function:
1929+
matmul:
1930+
- *f8_fnuz_precision_dst_f8_fnuz
1931+
- *f8_fnuz_precision_dst_fp16
1932+
- *f8_fnuz_precision_dst_bf16
1933+
- *f8_fnuz_precision_dst_fp32
1934+
- *bf8_fnuz_precision_dst_bf8_fnuz
1935+
- *bf8_fnuz_precision_dst_fp16
1936+
- *bf8_fnuz_precision_dst_bf16
1937+
- *bf8_fnuz_precision_dst_fp32
1938+
matrix_size:
1939+
- { M: 4096, N: 4096, K: 2048 }
1940+
transA: N
1941+
transB: T
1942+
alpha: [1.0, 2.0]
1943+
beta: [ 0.0, 2.0 ]
1944+
scaleA: [ 0, 1]
1945+
scaleB: [ 0, 1]
1946+
bias_vector: [0, 1]
1947+
bias_type: f32_r
1948+
unit_check: 1
1949+
gpu_arch: '942'
1950+
1951+
- name: matmul_large_nt_f64_dst_f64_gfx942
1952+
category: pre_checkin
1953+
function:
1954+
matmul: *double_precision
1955+
matrix_size:
1956+
- { M: 4096, N: 4096, K: 2048 }
1957+
transA: N
1958+
transB: T
1959+
alpha: [1.0, 2.0]
1960+
beta: [ 0.0, 2.0 ]
1961+
unit_check: 1
1962+
gpu_arch: '942'
1963+
19261964
- name: matmul_mx_datatypes
19271965
category: pre_checkin
19281966
function:

library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/gfx942/GridBased/aquavanjaram_Cijk_Ailk_Bjlk_B8NB8NS_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs.yaml

Lines changed: 304 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1378,6 +1378,308 @@
13781378
_WorkspaceSizePerElemBias: 0
13791379
_WorkspaceSizePerElemC: 4
13801380
_staggerStrideShift: 3
1381+
- 1LDSBuffer: 1
1382+
ActivationAlt: false
1383+
ActivationFuncCall: false
1384+
ActivationFused: true
1385+
AssertAIGreaterThanEqual: -1
1386+
AssertAILessThanEqual: -1
1387+
AssertFree0ElementMultiple: 1
1388+
AssertFree1ElementMultiple: 1
1389+
AssertSummationElementMultiple: 1
1390+
AssignedDerivedParameters: true
1391+
AssignedProblemIndependentDerivedParameters: true
1392+
BaseName: Cijk_Ailk_Bjlk_B8NB8NS_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x256x64_I5iHQbor2RtVIIizztpt5kW77h2OWmEhyzabuC8fjyQ=
1393+
BufferLoad: true
1394+
BufferStore: true
1395+
CUCount: null
1396+
CUOccupancy: -1
1397+
ClusterLocalRead: 0
1398+
CodeObjectVersion: '4'
1399+
ConvertAfterDS: 0
1400+
CustomKernelName: ''
1401+
DebugStreamK: 0
1402+
DepthU: 64
1403+
DirectToLds: false
1404+
DirectToLdsA: false
1405+
DirectToLdsB: false
1406+
DirectToVgprA: 0
1407+
DirectToVgprB: 0
1408+
DirectToVgprSparseMetadata: false
1409+
EdgeType: ShiftPtr
1410+
EnableF32XEmulationLds: false
1411+
EnableF32XdlMathOp: false
1412+
EnableMatrixInstruction: true
1413+
ExpandPointerSwap: 0
1414+
ExpertSchedulingMode: 0
1415+
ForceDisableShadowInit: false
1416+
GlobalReadPerMfma: 0.1
1417+
GlobalReadVectorWidthA: 16
1418+
GlobalReadVectorWidthB: 16
1419+
GlobalSplitU: 1
1420+
GlobalSplitUAlgorithm: MultipleBuffer
1421+
GlobalSplitUCoalesced: false
1422+
GlobalSplitUWorkGroupMappingRoundRobin: false
1423+
GlobalWriteVectorWidth: 8
1424+
GroupLoadStore: false
1425+
GuaranteeNoPartialA: false
1426+
GuaranteeNoPartialB: false
1427+
GuaranteeNoPartialMetadata: true
1428+
ISA: [9, 4, 2]
1429+
InnerUnroll: 1
1430+
InterleaveAlpha: 0
1431+
InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true,
1432+
SupportUserGSU: true, UseUniversalArgs: true}
1433+
Kernel: true
1434+
KernelLanguage: Assembly
1435+
KernelNameMin: Cijk_Ailk_Bjlk_B8NB8NS_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p10_GRVWA16_GRVWB16_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA0_LPB0_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS2_NLCA1_NLCB1_ONLL2_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB8_WSGRA0_WSGRB2_WS64_WG32_8_1
1436+
LDSTrInst: false
1437+
LSCA: 256
1438+
LSCB: 256
1439+
LSPA: 16
1440+
LSPB: 4
1441+
LVCA: 16
1442+
LVCB: 16
1443+
LVPA: 1
1444+
LVPB: 1
1445+
LdsBlockSizePerPadA: 2048
1446+
LdsBlockSizePerPadB: 2048
1447+
LdsBlockSizePerPadMetadata: 0
1448+
LdsBytesNoAmax: 32768
1449+
LdsInitCVgprs: false
1450+
LdsNumBytes: 32768
1451+
LdsNumElementsAlignedA: 16384
1452+
LdsNumElementsAlignedB: 16384
1453+
LdsNumElementsAlignedMetadata: 0
1454+
LdsOffsetA: 0
1455+
LdsOffsetA_Blk: 32768
1456+
LdsOffsetB: 16384
1457+
LdsOffsetB_Blk: 49152
1458+
LdsOffsetBias: 0
1459+
LdsOffsetBiasGSU: 0
1460+
LdsOffsetBiasNonGSU: 0
1461+
LdsOffsetMetadata: 32768
1462+
LdsOffsetMetadata_Blk: 49152
1463+
LdsPadA: 0
1464+
LdsPadB: 0
1465+
LdsPadMetadata: 0
1466+
LocalReadVectorWidth: 8
1467+
LocalSplitU: 1
1468+
LocalSplitUReuseLDS: 1
1469+
LocalWritePerMfma: -1
1470+
LocalWriteUseSgprA: false
1471+
LocalWriteUseSgprB: false
1472+
LoopIters: 2
1473+
LoopUnroll: 64
1474+
MFMA_BF16_1K: false
1475+
MIArchVgpr: false
1476+
MIBlock: [16, 16, 32, 1, 1, 1]
1477+
MIInputPerThread: 8
1478+
MIInputPerThreadA: 8
1479+
MIInputPerThreadB: 8
1480+
MIInputPerThreadMetadata: 8
1481+
MIOutputVectorWidth: 4
1482+
MIRegPerOut: 1
1483+
MIWaveGroup: [2, 2]
1484+
MIWaveTile: [8, 8]
1485+
MIWaveTileA: 8
1486+
MIWaveTileB: 8
1487+
MIWaveTileMetadata: 0
1488+
MacroTile0: 256
1489+
MacroTile1: 256
1490+
MacroTileA: 256
1491+
MacroTileB: 256
1492+
MagicDivAlg: 2
1493+
MathClocksUnrolledLoop: 0
1494+
MatrixInstB: 1
1495+
MatrixInstBM: 1
1496+
MatrixInstBN: 1
1497+
MatrixInstK: 32
1498+
MatrixInstM: 16
1499+
MatrixInstN: 16
1500+
MatrixInstruction: [16, 16, 32, 1]
1501+
MaxLDS: 65536
1502+
MaxOccupancy: 40
1503+
MbskPrefetchOpt: 0
1504+
NoLdsWriteCode: false
1505+
NoReject: false
1506+
NoTailLoop: false
1507+
NonDTLTailLoopA: false
1508+
NonDTLTailLoopB: false
1509+
NonTemporal: -1
1510+
NonTemporalA: 0
1511+
NonTemporalB: 0
1512+
NonTemporalC: 0
1513+
NonTemporalD: 4
1514+
NonTemporalE: 0
1515+
NonTemporalMetadata: 0
1516+
NonTemporalWS: 0
1517+
NumElementsPerBatchStore: 2
1518+
NumElementsPerThread: 256
1519+
NumGlobalWriteVectorsPerThread: 32
1520+
NumLoadsA: 4
1521+
NumLoadsB: 4
1522+
NumLoadsCoalescedA: 1
1523+
NumLoadsCoalescedB: 1
1524+
NumLoadsPerpendicularA: 4
1525+
NumLoadsPerpendicularB: 4
1526+
NumThreads: 256
1527+
NumWaveSplitK: 1
1528+
OptNoLoadLoop: 2
1529+
PackedC0IdxChars: [I]
1530+
PackedC0IndicesX: [0]
1531+
PackedC1IdxChars: [J]
1532+
PackedC1IndicesX: [1]
1533+
PrefetchGlobalRead: 2
1534+
PrefetchLocalRead: 1
1535+
PreloadKernArgs: true
1536+
ProblemType:
1537+
Activation: true
1538+
ActivationComputeDataType: 0
1539+
ActivationNoGuard: false
1540+
ActivationType: hipblaslt_all
1541+
AllowNoFreeDims: false
1542+
AssignedDerivedParameters: true
1543+
Batched: true
1544+
BetaOnlyUseBias: false
1545+
BiasDataTypeList: [0, 4, 7]
1546+
BiasSrc: D
1547+
ComplexConjugateA: false
1548+
ComplexConjugateB: false
1549+
ComputeDataType: 0
1550+
DataType: 12
1551+
DataTypeA: 12
1552+
DataTypeAmaxD: 0
1553+
DataTypeB: 12
1554+
DataTypeE: 12
1555+
DestDataType: 12
1556+
F32XdlMathOp: 0
1557+
Gradient: false
1558+
GroupedGemm: false
1559+
HighPrecisionAccumulate: true
1560+
Index0: 0
1561+
Index01A: 0
1562+
Index01B: 1
1563+
Index1: 1
1564+
IndexAssignmentsA: [0, 3, 2]
1565+
IndexAssignmentsB: [1, 3, 2]
1566+
IndexAssignmentsLD: [4, 5, 6, 7]
1567+
IndexAssignmentsMetadata: [3, 0, 2]
1568+
IndexUnroll: 3
1569+
IndexUnrollA: 1
1570+
IndexUnrollB: 1
1571+
IndexUnrollM: 0
1572+
IndicesBatch: [2]
1573+
IndicesFree: [0, 1]
1574+
IndicesSummation: [3]
1575+
MirrorDimsA: []
1576+
MirrorDimsB: []
1577+
MirrorDimsMetadata: []
1578+
NumIndicesBatch: 1
1579+
NumIndicesC: 3
1580+
NumIndicesFree: 2
1581+
NumIndicesLD: 4
1582+
NumIndicesSummation: 1
1583+
OperationType: GEMM
1584+
OutputAmaxD: false
1585+
SetConstStrideA: []
1586+
SetConstStrideB: []
1587+
SetConstStrideBias: []
1588+
SilentHighPrecisionAccumulate: false
1589+
Sparse: 0
1590+
StochasticRounding: false
1591+
StridedBatched: true
1592+
SupportUserArgs: true
1593+
SwizzleTensorA: false
1594+
SwizzleTensorB: false
1595+
TLUA: true
1596+
TLUB: true
1597+
Tensor0: 0
1598+
Tensor1: 1
1599+
TileA: 0
1600+
TileAwareSelection: false
1601+
TileB: 1
1602+
TotalIndices: 4
1603+
TransposeA: 0
1604+
TransposeB: 1
1605+
UseBeta: true
1606+
UseBias: 1
1607+
UseE: false
1608+
UseInitialStridesAB: false
1609+
UseInitialStridesCD: false
1610+
UseScaleAB: Scalar
1611+
UseScaleAlphaVec: 1
1612+
UseScaleCD: true
1613+
ScheduleGlobalRead: 1
1614+
ScheduleIterAlg: 3
1615+
ScheduleLocalWrite: 1
1616+
SolutionIndex: 5
1617+
SolutionNameMin: Cijk_Ailk_Bjlk_B8NB8NS_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p10_GRVWA16_GRVWB16_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA0_LPB0_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS2_NLCA1_NLCB1_ONLL2_PGR2_PLR1_PKA1_SIA3_SS1_SU64_SUM1_SUS64_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB8_WSGRA0_WSGRB2_WS64_WG32_8_1_WGM12_WGMXCC4_WGMXCCGn1
1618+
SourceSwap: 1
1619+
StaggerU: 64
1620+
StaggerUMapping: 1
1621+
StaggerUStride: 64
1622+
StorePriorityOpt: 0
1623+
StoreRemapVectorWidth: 0
1624+
StoreSwapAddr: false
1625+
StoreSyncOpt: 0
1626+
StoreVectorWidth: 8
1627+
StreamK: 0
1628+
StreamKAtomic: 0
1629+
StreamKXCCMapping: 0
1630+
SubGroup0: 8
1631+
SubGroup1: 32
1632+
SubGroupA: 8
1633+
SubGroupB: 32
1634+
SuppressNoLoadLoop: false
1635+
ThreadTile: [1, 1]
1636+
ThreadTile0: 32
1637+
ThreadTile1: 8
1638+
ThreadTileA: 32
1639+
ThreadTileB: 8
1640+
TransposeLDS: 0
1641+
TransposeLDSMetadata: true
1642+
ULSGRODoubleG2L: 0
1643+
UnrollLoopSwapGlobalReadOrder: 0
1644+
UnrollMajorLDSA: 0
1645+
UnrollMajorLDSB: 0
1646+
UnrollMajorLDSMetadata: true
1647+
Use64bShadowLimit: 1
1648+
UseDotInstruction: false
1649+
UseF32XEmulation: false
1650+
UseInstOffsetForGRO: 0
1651+
UseSgprForGRO: -1
1652+
Valid: true
1653+
VectorStore: -1
1654+
VectorWidthA: 8
1655+
VectorWidthB: 8
1656+
WaveSeparateGlobalReadA: 0
1657+
WaveSeparateGlobalReadB: 2
1658+
WaveSeparateGlobalReadMetadata: 0
1659+
WaveSplitK: false
1660+
WavefrontSize: 64
1661+
WorkGroup: [32, 8, 1]
1662+
WorkGroupMapping: 12
1663+
WorkGroupMappingXCC: 4
1664+
WorkGroupMappingXCCGroup: -1
1665+
WorkGroupReduction: false
1666+
WorkspaceCheck: [4, 0, 1]
1667+
_DepthU: 64
1668+
_DepthUA: 64
1669+
_DepthUB: 64
1670+
_DepthUMetadata: 64
1671+
_GlobalAccumulation: MultipleBuffer
1672+
_UseSgprForGRO: false
1673+
_VectorStore: 1
1674+
_WorkspaceSizePerElemBias: 0
1675+
_WorkspaceSizePerElemC: 4
1676+
_staggerStrideShift: 0
1677+
enableLDSTrA: false
1678+
enableLDSTrB: false
1679+
reorderGRInstForDTVA: false
1680+
reorderGRInstForDTVB: false
1681+
tailLoopOptA: true
1682+
tailLoopOptB: true
13811683
- [2, 3, 0, 1]
13821684
- - - [128, 128, 1, 128]
13831685
- [1, 0.0]
@@ -1389,6 +1691,8 @@
13891691
- [3, 0.0]
13901692
- - [3, 3, 1, 3]
13911693
- [4, 0.0]
1694+
- - [34048, 34048, 1, 2560]
1695+
- [5, 0.0]
13921696
- null
13931697
- null
13941698
- DeviceEfficiency

0 commit comments

Comments
 (0)