Skip to content

Commit a1d945b

Browse files
committed
fix(network): extract net class cleanup logic to a separate method
Move the cgroup net class cleanup logic from `removePod` to a new `clearNetClassIfNeed` method to avoid network plugin state residue.
1 parent 5bb737c commit a1d945b

File tree

2 files changed

+154
-18
lines changed

2 files changed

+154
-18
lines changed

pkg/agent/qrm-plugins/network/staticpolicy/policy.go

Lines changed: 26 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1170,24 +1170,9 @@ func (p *StaticPolicy) getResourceAllocationAnnotations(
11701170
}
11711171

11721172
func (p *StaticPolicy) removePod(podUID string) error {
1173-
if p.CgroupV2Env {
1174-
cgIDList, err := p.metaServer.ExternalManager.ListCgroupIDsForPod(podUID)
1175-
if err != nil {
1176-
if general.IsErrNotFound(err) {
1177-
general.Warningf("cgroup ids for pod not found")
1178-
return nil
1179-
}
1180-
return fmt.Errorf("[NetworkStaticPolicy.removePod] list cgroup ids of pod: %s failed with error: %v", podUID, err)
1181-
}
1182-
1183-
for _, cgID := range cgIDList {
1184-
go func(cgID uint64) {
1185-
if err := p.metaServer.ExternalManager.ClearNetClass(cgID); err != nil {
1186-
general.Errorf("delete net class failed, cgID: %v, err: %v", cgID, err)
1187-
return
1188-
}
1189-
}(cgID)
1190-
}
1173+
err := p.clearNetClassIfNeed(podUID)
1174+
if err != nil {
1175+
return fmt.Errorf("clearNetClassIfNeed failed with error: %v", err)
11911176
}
11921177

11931178
// update state cache
@@ -1370,6 +1355,29 @@ func (p *StaticPolicy) generateAndApplyGroups() error {
13701355
return nil
13711356
}
13721357

1358+
func (p *StaticPolicy) clearNetClassIfNeed(podUID string) error {
1359+
if p.CgroupV2Env {
1360+
cgIDList, err := p.metaServer.ExternalManager.ListCgroupIDsForPod(podUID)
1361+
if err != nil {
1362+
if general.IsErrNotFound(err) {
1363+
general.Warningf("cgroup ids for pod %s not found", podUID)
1364+
return nil
1365+
}
1366+
return fmt.Errorf("list cgroup ids of pod: %s failed with error: %v", podUID, err)
1367+
}
1368+
1369+
for _, cgID := range cgIDList {
1370+
go func(cgID uint64) {
1371+
if err := p.metaServer.ExternalManager.ClearNetClass(cgID); err != nil {
1372+
general.Errorf("delete net class failed, cgID: %v, err: %v", cgID, err)
1373+
return
1374+
}
1375+
}(cgID)
1376+
}
1377+
}
1378+
return nil
1379+
}
1380+
13731381
func getAllNICs(nicManager nic.NICManager) []machine.InterfaceInfo {
13741382
nics := nicManager.GetNICs()
13751383
return append(nics.HealthyNICs, nics.UnhealthyNICs...)

pkg/agent/qrm-plugins/network/staticpolicy/policy_test.go

Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ import (
5555
"github.com/kubewharf/katalyst-core/pkg/metrics"
5656
"github.com/kubewharf/katalyst-core/pkg/util/cgroup/common"
5757
"github.com/kubewharf/katalyst-core/pkg/util/external/network"
58+
"github.com/kubewharf/katalyst-core/pkg/util/general"
5859
"github.com/kubewharf/katalyst-core/pkg/util/machine"
5960
)
6061

@@ -386,6 +387,8 @@ func TestRemovePod(t *testing.T) {
386387
PodUid: podID,
387388
}
388389

390+
//---verify cgroup id not found
391+
policy.CgroupV2Env = true
389392
_, err = policy.RemovePod(context.TODO(), delReq)
390393
assert.NoError(t, err)
391394

@@ -1801,3 +1804,128 @@ func TestStaticPolicy_applyNetClass(t *testing.T) {
18011804

18021805
policy.applyNetClass()
18031806
}
1807+
1808+
type errCgroupIDManager struct {
1809+
cgroupid.CgroupIDManagerStub
1810+
errListCgroupIDsForPod error
1811+
}
1812+
1813+
func (f *errCgroupIDManager) ListCgroupIDsForPod(_ string) ([]uint64, error) {
1814+
return nil, f.errListCgroupIDsForPod
1815+
}
1816+
1817+
type errNetworkManager struct {
1818+
network.NetworkManagerStub
1819+
errClearNetClass error
1820+
}
1821+
1822+
func (n *errNetworkManager) ClearNetClass(_ uint64) error {
1823+
return n.errClearNetClass
1824+
}
1825+
1826+
func TestStaticPolicy_clearNetClassIfNeed(t *testing.T) {
1827+
t.Parallel()
1828+
type fields struct {
1829+
cgroupIDManager cgroupid.CgroupIDManager
1830+
networkManager network.NetworkManager
1831+
}
1832+
type args struct {
1833+
podUID string
1834+
}
1835+
tests := []struct {
1836+
name string
1837+
fields fields
1838+
args args
1839+
wantErr assert.ErrorAssertionFunc
1840+
}{
1841+
{
1842+
name: "clear net class success",
1843+
fields: fields{
1844+
cgroupIDManager: &cgroupid.CgroupIDManagerStub{
1845+
ContainerCGroupIDMap: map[string]map[string]uint64{
1846+
"test-pod-uid": {
1847+
"test-container-id": 314125,
1848+
},
1849+
},
1850+
},
1851+
networkManager: &network.NetworkManagerStub{
1852+
NetClassMap: map[string]map[string]*common.NetClsData{
1853+
"test-pod-uid": {
1854+
"test-container-id": {
1855+
ClassID: 0,
1856+
CgroupID: 314125,
1857+
Attributes: map[string]string{
1858+
testNetClassIDResourceAllocationAnnotationKey: testSharedNetClsId,
1859+
},
1860+
},
1861+
},
1862+
},
1863+
},
1864+
},
1865+
args: args{
1866+
podUID: "test-pod-uid",
1867+
},
1868+
wantErr: assert.NoError,
1869+
},
1870+
{
1871+
name: "list cgroup id for pod not found",
1872+
fields: fields{
1873+
cgroupIDManager: &errCgroupIDManager{
1874+
errListCgroupIDsForPod: general.ErrNotFound,
1875+
},
1876+
},
1877+
args: args{podUID: "test-pod-uid"},
1878+
wantErr: assert.NoError,
1879+
},
1880+
{
1881+
name: "list cgroup id for pod failed",
1882+
fields: fields{
1883+
cgroupIDManager: &errCgroupIDManager{
1884+
errListCgroupIDsForPod: fmt.Errorf("ListCgroupIDsForPod"),
1885+
},
1886+
},
1887+
args: args{
1888+
podUID: "test-pod-uid",
1889+
},
1890+
wantErr: assert.Error,
1891+
},
1892+
{
1893+
name: "clear net class fail",
1894+
fields: fields{
1895+
cgroupIDManager: &cgroupid.CgroupIDManagerStub{
1896+
ContainerCGroupIDMap: map[string]map[string]uint64{
1897+
"test-pod-uid": {
1898+
"test-container-id": 314125,
1899+
},
1900+
},
1901+
},
1902+
networkManager: &errNetworkManager{
1903+
errClearNetClass: fmt.Errorf("ClearNetClass"),
1904+
},
1905+
},
1906+
args: args{
1907+
podUID: "test-pod-uid",
1908+
},
1909+
wantErr: assert.NoError,
1910+
},
1911+
}
1912+
for _, tt := range tests {
1913+
tt := tt
1914+
t.Run(tt.name, func(t *testing.T) {
1915+
t.Parallel()
1916+
1917+
metaServer := makeMetaServer()
1918+
p := &StaticPolicy{
1919+
CgroupV2Env: true,
1920+
metaServer: metaServer,
1921+
}
1922+
1923+
p.metaServer.ExternalManager = &external.DummyExternalManager{
1924+
CgroupIDManager: tt.fields.cgroupIDManager,
1925+
NetworkManager: tt.fields.networkManager,
1926+
}
1927+
time.Sleep(1 * time.Second)
1928+
tt.wantErr(t, p.clearNetClassIfNeed(tt.args.podUID), fmt.Sprintf("clearNetClassIfNeed(%v)", tt.args.podUID))
1929+
})
1930+
}
1931+
}

0 commit comments

Comments
 (0)