@@ -731,19 +731,19 @@ retry:
731731 }
732732 sort .Ints (irqs )
733733
734- general .Infof ("%s [before tidy] nic %s irq affinity:" , IrqTuningLogPrefix , nic )
734+ general .Infof ("%s [before tidy] nic %s irq affinity (irq -> cpu) :" , IrqTuningLogPrefix , nic )
735735 for _ , irq := range irqs {
736- cpuStr , _ := general .ConvertIntSliceToBitmapString (irq2CPUs [irq ])
737- general .Infof ("%s irq %d: cpu %s" , IrqTuningLogPrefix , irq , cpuStr )
736+ cpuStr := general .ConvertLinuxListToString (irq2CPUs [irq ])
737+ general .Infof ("%s %d -> %s" , IrqTuningLogPrefix , irq , cpuStr )
738738 }
739739
740740 irq2Core , err = machine .TidyUpNicIrqsAffinityCPUs (irq2CPUs )
741741 if err != nil {
742742 general .Errorf ("%s nic %s failed to TidyUpIrqsAffinityCPUs, err %v" , IrqTuningLogPrefix , nic , err )
743743 } else {
744- general .Infof ("%s [after tidy] nic %s irq affinity:" , IrqTuningLogPrefix , nic )
744+ general .Infof ("%s [after tidy] nic %s irq affinity (irq -> cpu) :" , IrqTuningLogPrefix , nic )
745745 for _ , irq := range irqs {
746- general .Infof ("%s irq %d: cpu %d" , IrqTuningLogPrefix , irq , irq2Core [irq ])
746+ general .Infof ("%s %d -> %d" , IrqTuningLogPrefix , irq , irq2Core [irq ])
747747 }
748748 }
749749
@@ -1018,7 +1018,31 @@ func (n *NicInfo) filterIrqCores(coresList []int64) []int64 {
10181018 return filteredIrqCores
10191019}
10201020
1021- func (n * NicInfo ) sync () error {
1021+ func (n * NicInfo ) updateIrqAffinity (changedIrq2Core map [int ]int64 ) error {
1022+ if len (changedIrq2Core ) == 0 {
1023+ return nil
1024+ }
1025+
1026+ for irq , core := range changedIrq2Core {
1027+ n .Irq2Core [irq ] = core
1028+ }
1029+
1030+ socketIrqCores , err := getSocketIrqCores (n .Irq2Core )
1031+ if err != nil {
1032+ return fmt .Errorf ("nic %s failed to getSocketIrqCores, err %s" , n , err )
1033+ }
1034+ n .SocketIrqCores = socketIrqCores
1035+
1036+ irqs := n .getIrqs ()
1037+ general .Infof ("%s after updateIrqAffinity, nic %s irq affinity (irq -> cpu):" , IrqTuningLogPrefix , n )
1038+ for _ , irq := range irqs {
1039+ general .Infof ("%s %d -> %d" , IrqTuningLogPrefix , irq , n .Irq2Core [irq ])
1040+ }
1041+
1042+ return nil
1043+ }
1044+
1045+ func (n * NicInfo ) syncIrqAffinityFromKernel () error {
10221046 nicInfo , err := GetNicInfo (n .NicBasicInfo )
10231047 if err != nil {
10241048 return fmt .Errorf ("failed to GetNicInfo for nic %s, err %v" , n , err )
@@ -1030,25 +1054,16 @@ func (n *NicInfo) sync() error {
10301054 return nil
10311055}
10321056
1033- func listActiveUplinkNicsExcludeSriovVFs (netNSDir string ) ([]* machine.NicBasicInfo , error ) {
1034- nics , err := machine .ListActiveUplinkNics (netNSDir )
1057+ func listHostActiveUplinkNics (netNSDir string ) ([]* machine.NicBasicInfo , error ) {
1058+ // names of container network namespaces (including those of SRIOV containers) are prefixed with "cni-",
1059+ // additionally, the NICs of SRIOV containers will be required during the periodic ListContainers process.
1060+ nics , err := machine .ListActiveUplinkNics (netNSDir , []string {machine .ContainerNetNSPrefix })
10351061 if err != nil {
10361062 return nil , err
10371063 }
10381064
1039- // filter out nics which are dedicated to sriov dedicated-cores containers from nics
1040- // sriov dedicated-cores container's nic's irq affinity will be tuned in initialize tuning and periodic tuning
1041- var tmpNics []* machine.NicBasicInfo
1042- for _ , nic := range nics {
1043- // all sriov netns's names hava prefix "cni-", sriov netns is managed by cni plugin
1044- if ! strings .HasPrefix (nic .NSName , "cni-" ) {
1045- tmpNics = append (tmpNics , nic )
1046- }
1047- }
1048- nics = tmpNics
1049-
10501065 if len (nics ) == 0 {
1051- return nil , fmt .Errorf ("no active uplink nics after filtering out sriov nics , it's impossible" )
1066+ return nil , fmt .Errorf ("no active uplink nics, it's impossible" )
10521067 }
10531068
10541069 // sort nics by ifindex
@@ -1629,9 +1644,11 @@ func (ic *IrqTuningController) getAllNics() []*NicIrqTuningManager {
16291644 return nics
16301645}
16311646
1632- func (ic * IrqTuningController ) emitErrMetric (reason string , level int64 ) {
1647+ func (ic * IrqTuningController ) emitErrMetric (reason string , level int64 , tags ... metrics.MetricTag ) {
1648+ allTags := []metrics.MetricTag {{Key : "reason" , Val : reason }}
1649+ allTags = append (allTags , tags ... )
16331650 _ = ic .emitter .StoreInt64 (metricUtil .MetricNameIrqTuningErr , level , metrics .MetricTypeNameRaw ,
1634- metrics. MetricTag { Key : "reason" , Val : reason } )
1651+ allTags ... )
16351652}
16361653
16371654func (ic * IrqTuningController ) emitIrqTuningPolicy () {
@@ -2142,7 +2159,7 @@ func (ic *IrqTuningController) classifyNicsByThroughput(oldIndicatorsStats *Indi
21422159func (ic * IrqTuningController ) syncNics () error {
21432160 general .Infof ("%s sync nics" , IrqTuningLogPrefix )
21442161
2145- nics , err := listActiveUplinkNicsExcludeSriovVFs (ic .agentConf .MachineInfoConfiguration .NetNSDirAbsPath )
2162+ nics , err := listHostActiveUplinkNics (ic .agentConf .MachineInfoConfiguration .NetNSDirAbsPath )
21462163 if err != nil {
21472164 return err
21482165 }
@@ -2160,7 +2177,7 @@ func (ic *IrqTuningController) syncNics() error {
21602177 }
21612178
21622179 if ! nicsChanged {
2163- // nics has been sorted by ifindex in listActiveUplinkNicsExcludeSriovVFs
2180+ // nics has been sorted by ifindex in listHostActiveUplinkNics
21642181 for i := range oldNics {
21652182 if ! nics [i ].Equal (oldNics [i ].NicInfo .NicBasicInfo ) {
21662183 nicsChanged = true
@@ -2171,6 +2188,35 @@ func (ic *IrqTuningController) syncNics() error {
21712188
21722189 if ! nicsChanged {
21732190 general .Infof ("%s no nic changed" , IrqTuningLogPrefix )
2191+
2192+ for _ , nic := range oldNics {
2193+ oldIrq2Core := nic .NicInfo .Irq2Core
2194+ if err := nic .NicInfo .syncIrqAffinityFromKernel (); err != nil {
2195+ general .Errorf ("%s nic %s failed to syncIrqAffinityFromKernel, err %s" , IrqTuningLogPrefix , nic .NicInfo , err )
2196+
2197+ ic .emitErrMetric (irqtuner .SyncIrqAffinityFromKernelFailed , irqtuner .IrqTuningError ,
2198+ metrics.MetricTag {Key : "nic" , Val : nic .NicInfo .UniqName ()})
2199+ }
2200+
2201+ for irq , oldCore := range oldIrq2Core {
2202+ newCore := nic .NicInfo .Irq2Core [irq ]
2203+ if newCore != oldCore {
2204+ throughputClass := "normal"
2205+ if ic .isLowThroughputNic (nic .NicInfo ) {
2206+ throughputClass = "low"
2207+ }
2208+
2209+ general .Errorf ("%s %s throughput nic %s irq %d expectly affinity cpu %d, but actually affinity cpu %d" ,
2210+ IrqTuningLogPrefix , throughputClass , nic .NicInfo , irq , oldCore , newCore )
2211+
2212+ _ = ic .emitter .StoreInt64 (metricUtil .MetricNameIrqTuningIrqAffintityInconsistent , 1 , metrics .MetricTypeNameRaw ,
2213+ metrics.MetricTag {Key : "nic" , Val : nic .NicInfo .UniqName ()},
2214+ metrics.MetricTag {Key : "irq" , Val : strconv .Itoa (irq )},
2215+ metrics.MetricTag {Key : "throughput" , Val : throughputClass })
2216+ }
2217+ }
2218+ }
2219+
21742220 return nil
21752221 }
21762222
@@ -2686,8 +2732,6 @@ func (ic *IrqTuningController) tuneNicIrqsAffinityQualifiedCores(nic *NicInfo, i
26862732 accountedIrqs [irq ] = struct {}{}
26872733 }
26882734
2689- hasIrqTuned := false
2690-
26912735 isSriovContainerNic := ic .isSriovContainerNic (nic )
26922736 if isSriovContainerNic {
26932737 // sriov nic's irqs are accounted in getCoresIrqCount
@@ -2696,6 +2740,7 @@ func (ic *IrqTuningController) tuneNicIrqsAffinityQualifiedCores(nic *NicInfo, i
26962740 }
26972741 }
26982742
2743+ changedIrq2Core := make (map [int ]int64 )
26992744 for _ , irq := range irqs {
27002745 core , ok := nic .Irq2Core [irq ]
27012746 if ! ok {
@@ -2734,25 +2779,33 @@ func (ic *IrqTuningController) tuneNicIrqsAffinityQualifiedCores(nic *NicInfo, i
27342779 if err := machine .SetIrqAffinity (irq , targetCore ); err != nil {
27352780 general .Errorf ("%s failed to SetIrqAffinity(%d, %d) for nic %s, err %v" ,
27362781 IrqTuningLogPrefix , irq , targetCore , nic , err )
2782+
2783+ _ = ic .emitter .StoreInt64 (metricUtil .MetricNameIrqTuningSetIrqAffinityFailed , 1 , metrics .MetricTypeNameRaw ,
2784+ metrics.MetricTag {Key : "nic" , Val : nic .UniqName ()},
2785+ metrics.MetricTag {Key : "irq" , Val : strconv .Itoa (irq )})
27372786 continue
27382787 }
2739- general .Infof ("%s nic %s set irq %d affinity cpu %d" , IrqTuningLogPrefix , nic , irq , targetCore )
2788+ general .Infof ("%s nic %s set irq %d affinity from cpu %d to cpu %d " , IrqTuningLogPrefix , nic , irq , core , targetCore )
27402789
27412790 // sriov nic's irqs are accounted in getCoresIrqCount, so the irq count of the original core must be decremented here.
27422791 if _ , ok := accountedIrqs [irq ]; ok {
27432792 coresIrqCount [core ]--
27442793 }
27452794 coresIrqCount [targetCore ]++
27462795 accountedIrqs [irq ] = struct {}{}
2747- hasIrqTuned = true
2796+ changedIrq2Core [ irq ] = targetCore
27482797 }
27492798
27502799 ///////////////////////////////////////////////
27512800 // update nic.Irq2Core and nic.SocketIrqCores
27522801 ///////////////////////////////////////////////
2753- if hasIrqTuned {
2754- if err := nic .sync (); err != nil {
2755- general .Errorf ("%s failed to sync for nic %s, err %s" , IrqTuningLogPrefix , nic , err )
2802+ if len (changedIrq2Core ) > 0 {
2803+ if err := nic .updateIrqAffinity (changedIrq2Core ); err != nil {
2804+ general .Errorf ("%s failed to updateIrqAffinity(%+v) for nic %s, err %v" ,
2805+ IrqTuningLogPrefix , changedIrq2Core , nic , err )
2806+
2807+ ic .emitErrMetric (irqtuner .UpdateIrqAffinityFailed , irqtuner .IrqTuningError ,
2808+ metrics.MetricTag {Key : "nic" , Val : nic .UniqName ()})
27562809 }
27572810
27582811 if ic .isNormalThroughputNic (nic ) {
@@ -2769,6 +2822,16 @@ func (ic *IrqTuningController) tuneNicIrqsAffinityQualifiedCores(nic *NicInfo, i
27692822 tunedReason = irqtuner .NormalNicsChanged
27702823 } else {
27712824 tunedReason = irqtuner .UnexpectedTuning
2825+
2826+ general .Infof ("%s %s last tuned nics:" , IrqTuningLogPrefix , nic )
2827+ for _ , n := range nics {
2828+ general .Infof ("%s %s queueNum: %d" , IrqTuningLogPrefix , n , n .QueueNum )
2829+ }
2830+ general .Infof ("%s current nics:" , IrqTuningLogPrefix )
2831+ for _ , n := range currentNics {
2832+ general .Infof ("%s %s queueNum: %d" , IrqTuningLogPrefix , n , n .QueueNum )
2833+ }
2834+
27722835 general .Errorf ("%s nic %s unexpected balance-fair irq tuning, irqs: %+v, qualifiedCores: %+v" ,
27732836 IrqTuningLogPrefix , nic , irqs , qualifiedCoresMap )
27742837 }
@@ -3002,7 +3065,7 @@ func (ic *IrqTuningController) balanceNicIrqsInCoresFairly(nic *NicInfo, irqs []
30023065 general .Errorf ("%s nic %s failed to SetIrqAffinity(%d, %d), err %v" , IrqTuningLogPrefix , nic , irq , targetCore , err )
30033066 continue
30043067 }
3005- general .Infof ("%s nic %s set irq %d affinity cpu %d" , IrqTuningLogPrefix , nic , irq , targetCore )
3068+ general .Infof ("%s nic %s set irq %d affinity from cpu %d to cpu %d " , IrqTuningLogPrefix , nic , irq , oriCore , targetCore )
30063069
30073070 coresIrqCount [oriCore ]--
30083071 coresIrqCount [targetCore ]++
@@ -3011,6 +3074,19 @@ func (ic *IrqTuningController) balanceNicIrqsInCoresFairly(nic *NicInfo, irqs []
30113074 changedIrq2Core [irq ] = targetCore
30123075 }
30133076
3077+ if len (changedIrq2Core ) > 0 {
3078+ if err := nic .updateIrqAffinity (changedIrq2Core ); err != nil {
3079+ general .Errorf ("%s failed to updateIrqAffinity(%+v) for nic %s, err %v" ,
3080+ IrqTuningLogPrefix , changedIrq2Core , nic , err )
3081+
3082+ ic .emitErrMetric (irqtuner .UpdateIrqAffinityFailed , irqtuner .IrqTuningError ,
3083+ metrics.MetricTag {Key : "nic" , Val : nic .UniqName ()})
3084+ }
3085+
3086+ // clear changedIrq2Core
3087+ changedIrq2Core = make (map [int ]int64 )
3088+ }
3089+
30143090 // make sure no qualified core's irq count less-than round down avg core irq count.
30153091 // if there is a qualified core's irq count less-than round down avg core irq count, then find one qualified core from parameter
30163092 // irqs affinity cores whose irq count - this irq count greater-equal 2,
@@ -3084,7 +3160,7 @@ func (ic *IrqTuningController) balanceNicIrqsInCoresFairly(nic *NicInfo, irqs []
30843160 general .Errorf ("%s failed to SetIrqAffinity(%d, %d), err %v" , IrqTuningLogPrefix , targetIrq , core , err )
30853161 continue
30863162 }
3087- general .Infof ("%s nic %s set irq %d affinity cpu %d" , IrqTuningLogPrefix , nic , targetIrq , core )
3163+ general .Infof ("%s nic %s set irq %d affinity from cpu %d to cpu %d " , IrqTuningLogPrefix , nic , targetIrq , srcCore , core )
30883164
30893165 coresIrqCount [srcCore ]--
30903166 coresIrqCount [core ]++
@@ -3093,25 +3169,14 @@ func (ic *IrqTuningController) balanceNicIrqsInCoresFairly(nic *NicInfo, irqs []
30933169 changedIrq2Core [targetIrq ] = core
30943170 }
30953171
3096- if len (changedIrq2Core ) == 0 {
3097- return nil
3098- }
3099-
3100- // update nic.Irq2Core and nic.SocketIrqCores, just in case nic.sync failed
3101- for irq , core := range changedIrq2Core {
3102- nic .Irq2Core [irq ] = core
3103- }
3104-
3105- socketIrqCores , err := getSocketIrqCores (nic .Irq2Core )
3106- if err != nil {
3107- general .Errorf ("%s nic %s failed to getSocketIrqCores, err %s" , IrqTuningLogPrefix , nic , err )
3108- } else {
3109- nic .SocketIrqCores = socketIrqCores
3110- }
3172+ if len (changedIrq2Core ) >= 0 {
3173+ if err := nic .updateIrqAffinity (changedIrq2Core ); err != nil {
3174+ general .Errorf ("%s failed to updateIrqAffinity(%+v) for nic %s, err %v" ,
3175+ IrqTuningLogPrefix , changedIrq2Core , nic , err )
31113176
3112- // update nic info
3113- if err := nic .sync (); err != nil {
3114- general . Errorf ( "%s failed to sync nic %s, err %v" , IrqTuningLogPrefix , nic , err )
3177+ ic . emitErrMetric ( irqtuner . UpdateIrqAffinityFailed , irqtuner . IrqTuningError ,
3178+ metrics. MetricTag { Key : "nic" , Val : nic .UniqName ()})
3179+ }
31153180 }
31163181
31173182 return nil
@@ -3447,7 +3512,7 @@ func (ic *IrqTuningController) getNicsIfSRIOVContainer(cnt *irqtuner.ContainerIn
34473512 }
34483513
34493514 // all sriov netns's names have prefix "cni-", sriov netns is managed by cni plugin
3450- if ! strings . HasPrefix (containerNetNSInfo .NSName , "cni-" ) {
3515+ if ! machine . IsContainerNetNS (containerNetNSInfo .NSName ) {
34513516 return false , nil
34523517 }
34533518
@@ -4154,24 +4219,30 @@ func (ic *IrqTuningController) balanceIrqs(nic *NicIrqTuningManager, srcIrqCore
41544219 return nil , nil
41554220 }
41564221
4222+ changedIrq2Core := make (map [int ]int64 )
41574223 irqsAffinityTuning := make (map [int ]* IrqAffinityTuning )
41584224 for _ , irq := range irqs {
41594225 if err := machine .SetIrqAffinity (irq , destIrqCore .CpuID ); err != nil {
41604226 general .Errorf ("%s nic %s failed to SetIrqAffinity(%d, %d), err %v" , IrqTuningLogPrefix , nic .NicInfo , irq , destIrqCore .CpuID , err )
41614227 continue
41624228 }
4163- general .Infof ("%s nic %s set irq %d affinity cpu %d" , IrqTuningLogPrefix , nic .NicInfo , irq , destIrqCore .CpuID )
4229+ general .Infof ("%s nic %s set irq %d affinity from cpu %d to cpu %d " , IrqTuningLogPrefix , nic .NicInfo , irq , srcIrqCore . CpuID , destIrqCore .CpuID )
41644230
4165- nic . NicInfo . Irq2Core [irq ] = destIrqCore .CpuID
4231+ changedIrq2Core [irq ] = destIrqCore .CpuID
41664232 irqsAffinityTuning [irq ] = & IrqAffinityTuning {
41674233 SourceCore : srcIrqCore .CpuID ,
41684234 DestCore : destIrqCore .CpuID ,
41694235 }
4170- general .Infof ("%s nic %s tuning irq %d affinity from cpu %d to cpu %d" , IrqTuningLogPrefix , nic .NicInfo , irq , srcIrqCore .CpuID , destIrqCore .CpuID )
41714236 }
41724237
4173- if err := nic .NicInfo .sync (); err != nil {
4174- general .Errorf ("%s failed to sync for nic %s, err %s" , IrqTuningLogPrefix , nic .NicInfo , err )
4238+ if len (changedIrq2Core ) > 0 {
4239+ if err := nic .NicInfo .updateIrqAffinity (changedIrq2Core ); err != nil {
4240+ general .Errorf ("%s failed to updateIrqAffinity(%+v) for nic %s, err %v" ,
4241+ IrqTuningLogPrefix , changedIrq2Core , nic .NicInfo , err )
4242+
4243+ ic .emitErrMetric (irqtuner .UpdateIrqAffinityFailed , irqtuner .IrqTuningError ,
4244+ metrics.MetricTag {Key : "nic" , Val : nic .NicInfo .UniqName ()})
4245+ }
41754246 }
41764247
41774248 return irqsAffinityTuning , nil
@@ -4589,6 +4660,7 @@ func (ic *IrqTuningController) balanceIrqsToOtherExclusiveIrqCores(nic *NicIrqTu
45894660 cpusPPSBuffer [cpu ] = uint64 (ppsBuffer )
45904661 }
45914662
4663+ changedIrq2Core := make (map [int ]int64 )
45924664 for _ , queuePPS := range srcCoresQueuesPPSInDecOrder {
45934665 irq , ok := nic .NicInfo .Queue2Irq [queuePPS .QueueID ]
45944666 if ! ok {
@@ -4613,13 +4685,20 @@ func (ic *IrqTuningController) balanceIrqsToOtherExclusiveIrqCores(nic *NicIrqTu
46134685 general .Errorf ("%s nic %s failed to SetIrqAffinity(%d, %d), err %v" , IrqTuningLogPrefix , nic .NicInfo , irq , maxPPSBufferCore , err )
46144686 continue
46154687 }
4616- general .Infof ("%s nic %s set irq %d affinity cpu %d" , IrqTuningLogPrefix , nic .NicInfo , irq , maxPPSBufferCore )
4688+ general .Infof ("%s nic %s set irq %d affinity from cpu %d to cpu %d " , IrqTuningLogPrefix , nic .NicInfo , irq , nic . NicInfo . Irq2Core [ irq ] , maxPPSBufferCore )
46174689
46184690 cpusPPSBuffer [maxPPSBufferCore ] = maxPSSBuffer - queuePPS .PPS
4691+ changedIrq2Core [irq ] = maxPPSBufferCore
46194692 }
46204693
4621- if err := nic .NicInfo .sync (); err != nil {
4622- general .Errorf ("%s failed to sync for nic %s, err %s" , IrqTuningLogPrefix , nic .NicInfo , err )
4694+ if len (changedIrq2Core ) > 0 {
4695+ if err := nic .NicInfo .updateIrqAffinity (changedIrq2Core ); err != nil {
4696+ general .Errorf ("%s failed to updateIrqAffinity(%+v) for nic %s, err %v" ,
4697+ IrqTuningLogPrefix , changedIrq2Core , nic .NicInfo , err )
4698+
4699+ ic .emitErrMetric (irqtuner .UpdateIrqAffinityFailed , irqtuner .IrqTuningError ,
4700+ metrics.MetricTag {Key : "nic" , Val : nic .NicInfo .UniqName ()})
4701+ }
46234702 }
46244703
46254704 return nil
0 commit comments