117 lines
4.6 KiB
Diff
117 lines
4.6 KiB
Diff
From 44140f192be2eea3a71b3b6372ef45e8535dd802 Mon Sep 17 00:00:00 2001
|
|
From: zhaoxiaohu <zhaoxiaohu@kuaishou.com>
|
|
Date: Thu, 22 Aug 2024 16:39:45 +0800
|
|
Subject: [PATCH] Fix kubelet panic when allocating resources for a pod.
|
|
|
|
Reference: https://github.com/kubernetes/kubernetes/pull/119561/commits/d6b8a660b081916f3fae3319581ec2c49a2f5a05
|
|
|
|
Signed-off-by: zhaoxiaohu <zhaoxiaohu@kuaishou.com>
|
|
Signed-off-by: payall4u <payall4u@qq.com>
|
|
Signed-off-by: yuwang <yuwang@kuaishou.com>
|
|
---
|
|
pkg/kubelet/cm/devicemanager/manager.go | 12 ++--
|
|
pkg/kubelet/cm/devicemanager/manager_test.go | 60 ++++++++++++++++++++
|
|
2 files changed, 67 insertions(+), 5 deletions(-)
|
|
|
|
diff --git a/pkg/kubelet/cm/devicemanager/manager.go b/pkg/kubelet/cm/devicemanager/manager.go
|
|
index 95cf058f..1370675b 100644
|
|
--- a/pkg/kubelet/cm/devicemanager/manager.go
|
|
+++ b/pkg/kubelet/cm/devicemanager/manager.go
|
|
@@ -667,6 +667,13 @@ func (m *ManagerImpl) devicesToAllocate(podUID, contName, resource string, requi
|
|
// Create a closure to help with device allocation
|
|
// Returns 'true' once no more devices need to be allocated.
|
|
allocateRemainingFrom := func(devices sets.String) bool {
|
|
+ // When we call callGetPreferredAllocationIfAvailable below, we will release
|
|
+ // the lock and call the device plugin. If someone calls ListResource concurrently,
|
|
+ // device manager will recalculate the allocatedDevices map. Some entries with
|
|
+ // empty sets may be removed, so we reinit here.
|
|
+ if m.allocatedDevices[resource] == nil {
|
|
+ m.allocatedDevices[resource] = sets.NewString()
|
|
+ }
|
|
for device := range devices.Difference(allocated) {
|
|
m.allocatedDevices[resource].Insert(device)
|
|
allocated.Insert(device)
|
|
@@ -683,11 +690,6 @@ func (m *ManagerImpl) devicesToAllocate(podUID, contName, resource string, requi
|
|
return allocated, nil
|
|
}
|
|
|
|
- // Needs to allocate additional devices.
|
|
- if m.allocatedDevices[resource] == nil {
|
|
- m.allocatedDevices[resource] = sets.NewString()
|
|
- }
|
|
-
|
|
// Gets Devices in use.
|
|
devicesInUse := m.allocatedDevices[resource]
|
|
// Gets Available devices.
|
|
diff --git a/pkg/kubelet/cm/devicemanager/manager_test.go b/pkg/kubelet/cm/devicemanager/manager_test.go
|
|
index 9034498c..354dee50 100644
|
|
--- a/pkg/kubelet/cm/devicemanager/manager_test.go
|
|
+++ b/pkg/kubelet/cm/devicemanager/manager_test.go
|
|
@@ -1080,3 +1080,63 @@ func makeDevice(devOnNUMA checkpoint.DevicesPerNUMA, topology bool) map[string]p
|
|
}
|
|
return res
|
|
}
|
|
+
|
|
+func TestDevicesToAllocateConflictWithUpdateAllocatedDevices(t *testing.T) { // Regression test: devicesToAllocate must succeed even if UpdateAllocatedDevices runs while GetPreferredAllocation is in flight.
|
|
+ podToAllocate := "podToAllocate"
|
|
+ containerToAllocate := "containerToAllocate"
|
|
+ podToRemove := "podToRemove"
|
|
+ containerToRemove := "containerToRemove"
|
|
+ deviceID := "deviceID"
|
|
+ resourceName := "domain1.com/resource"
|
|
+
|
|
+ socket := filepath.Join(os.TempDir(), esocketName())
|
|
+ devs := []*pluginapi.Device{
|
|
+ {ID: deviceID, Health: pluginapi.Healthy},
|
|
+ }
|
|
+ p, e := esetup(t, devs, socket, resourceName, func(n string, d []pluginapi.Device) {})
|
|
+
|
|
+ waitUpdateAllocatedDevicesChan := make(chan struct{})
|
|
+ waitSetGetPreferredAllocChan := make(chan struct{})
|
|
+
|
|
+ p.SetGetPreferredAllocFunc(func(r *pluginapi.PreferredAllocationRequest, devs map[string]pluginapi.Device) (*pluginapi.PreferredAllocationResponse, error) {
|
|
+ waitSetGetPreferredAllocChan <- struct{}{} // wake the goroutine that calls UpdateAllocatedDevices
|
|
+ <-waitUpdateAllocatedDevicesChan // hold the plugin call until UpdateAllocatedDevices has returned
|
|
+ return &pluginapi.PreferredAllocationResponse{
|
|
+ ContainerResponses: []*pluginapi.ContainerPreferredAllocationResponse{
|
|
+ {
|
|
+ DeviceIDs: []string{deviceID},
|
|
+ },
|
|
+ },
|
|
+ }, nil
|
|
+ })
|
|
+
|
|
+ testManager := &ManagerImpl{
|
|
+ endpoints: make(map[string]endpointInfo),
|
|
+ healthyDevices: make(map[string]sets.String),
|
|
+ unhealthyDevices: make(map[string]sets.String),
|
|
+ allocatedDevices: make(map[string]sets.String),
|
|
+ podDevices: newPodDevices(),
|
|
+ activePods: func() []*v1.Pod { return []*v1.Pod{} },
|
|
+ sourcesReady: &sourcesReadyStub{},
|
|
+ topologyAffinityStore: topologymanager.NewFakeManager(),
|
|
+ }
|
|
+
|
|
+ testManager.endpoints[resourceName] = endpointInfo{
|
|
+ e: e,
|
|
+ opts: &pluginapi.DevicePluginOptions{
|
|
+ GetPreferredAllocationAvailable: true,
|
|
+ },
|
|
+ }
|
|
+ testManager.healthyDevices[resourceName] = sets.NewString(deviceID)
|
|
+ testManager.podDevices.insert(podToRemove, containerToRemove, resourceName, nil, nil) // not in activePods (empty above); presumably pruned by UpdateAllocatedDevices - confirm
|
|
+
|
|
+ go func() {
|
|
+ <-waitSetGetPreferredAllocChan
|
|
+ testManager.UpdateAllocatedDevices()
|
|
+ waitUpdateAllocatedDevicesChan <- struct{}{}
|
|
+ }()
|
|
+
|
|
+ set, err := testManager.devicesToAllocate(podToAllocate, containerToAllocate, resourceName, 1, sets.NewString())
|
|
+ assert.NoError(t, err)
|
|
+ assert.Equal(t, sets.NewString(deviceID), set) // testify order: expected, then actual
|
|
+}
|
|
--
|
|
2.33.0
|
|
|