Fix kubelet panic when allocate resource for pod.
Signed-off-by: zhaoxiaohu <zhaoxiaohu@kuaishou.com>
This commit is contained in:
parent
91db644133
commit
cefe431cf4
@ -0,0 +1,116 @@
|
||||
From 44140f192be2eea3a71b3b6372ef45e8535dd802 Mon Sep 17 00:00:00 2001
|
||||
From: zhaoxiaohu <zhaoxiaohu@kuaishou.com>
|
||||
Date: Thu, 22 Aug 2024 16:39:45 +0800
|
||||
Subject: [PATCH] Fix kubelet panic when allocate resource for pod.
|
||||
|
||||
Reference: https://github.com/kubernetes/kubernetes/pull/119561/commits/d6b8a660b081916f3fae3319581ec2c49a2f5a05
|
||||
|
||||
Signed-off-by: zhaoxiaohu <zhaoxiaohu@kuaishou.com>
|
||||
Signed-off-by: payall4u <payall4u@qq.com>
|
||||
Signed-off-by: yuwang <yuwang@kuaishou.com>
|
||||
---
|
||||
pkg/kubelet/cm/devicemanager/manager.go | 12 ++--
|
||||
pkg/kubelet/cm/devicemanager/manager_test.go | 60 ++++++++++++++++++++
|
||||
2 files changed, 67 insertions(+), 5 deletions(-)
|
||||
|
||||
diff --git a/pkg/kubelet/cm/devicemanager/manager.go b/pkg/kubelet/cm/devicemanager/manager.go
|
||||
index 95cf058f..1370675b 100644
|
||||
--- a/pkg/kubelet/cm/devicemanager/manager.go
|
||||
+++ b/pkg/kubelet/cm/devicemanager/manager.go
|
||||
@@ -667,6 +667,13 @@ func (m *ManagerImpl) devicesToAllocate(podUID, contName, resource string, requi
|
||||
// Create a closure to help with device allocation
|
||||
// Returns 'true' once no more devices need to be allocated.
|
||||
allocateRemainingFrom := func(devices sets.String) bool {
|
||||
+ // When we call callGetPreferredAllocationIfAvailable below, we will release
|
||||
+ // the lock and call the device plugin. If someone calls ListResource concurrently,
|
||||
+ // the device manager will recalculate the allocatedDevices map. Some entries with
|
||||
+ // empty sets may be removed, so we reinitialize the entry here.
|
||||
+ if m.allocatedDevices[resource] == nil {
|
||||
+ m.allocatedDevices[resource] = sets.NewString()
|
||||
+ }
|
||||
for device := range devices.Difference(allocated) {
|
||||
m.allocatedDevices[resource].Insert(device)
|
||||
allocated.Insert(device)
|
||||
@@ -683,11 +690,6 @@ func (m *ManagerImpl) devicesToAllocate(podUID, contName, resource string, requi
|
||||
return allocated, nil
|
||||
}
|
||||
|
||||
- // Needs to allocate additional devices.
|
||||
- if m.allocatedDevices[resource] == nil {
|
||||
- m.allocatedDevices[resource] = sets.NewString()
|
||||
- }
|
||||
-
|
||||
// Gets Devices in use.
|
||||
devicesInUse := m.allocatedDevices[resource]
|
||||
// Gets Available devices.
|
||||
diff --git a/pkg/kubelet/cm/devicemanager/manager_test.go b/pkg/kubelet/cm/devicemanager/manager_test.go
|
||||
index 9034498c..354dee50 100644
|
||||
--- a/pkg/kubelet/cm/devicemanager/manager_test.go
|
||||
+++ b/pkg/kubelet/cm/devicemanager/manager_test.go
|
||||
@@ -1080,3 +1080,63 @@ func makeDevice(devOnNUMA checkpoint.DevicesPerNUMA, topology bool) map[string]p
|
||||
}
|
||||
return res
|
||||
}
|
||||
+
|
||||
+func TestDevicesToAllocateConflictWithUpdateAllocatedDevices(t *testing.T) {
|
||||
+ podToAllocate := "podToAllocate"
|
||||
+ containerToAllocate := "containerToAllocate"
|
||||
+ podToRemove := "podToRemove"
|
||||
+ containerToRemove := "containerToRemove"
|
||||
+ deviceID := "deviceID"
|
||||
+ resourceName := "domain1.com/resource"
|
||||
+
|
||||
+ socket := filepath.Join(os.TempDir(), esocketName())
|
||||
+ devs := []*pluginapi.Device{
|
||||
+ {ID: deviceID, Health: pluginapi.Healthy},
|
||||
+ }
|
||||
+ p, e := esetup(t, devs, socket, resourceName, func(n string, d []pluginapi.Device) {})
|
||||
+
|
||||
+ waitUpdateAllocatedDevicesChan := make(chan struct{})
|
||||
+ waitSetGetPreferredAllocChan := make(chan struct{})
|
||||
+
|
||||
+ p.SetGetPreferredAllocFunc(func(r *pluginapi.PreferredAllocationRequest, devs map[string]pluginapi.Device) (*pluginapi.PreferredAllocationResponse, error) {
|
||||
+ waitSetGetPreferredAllocChan <- struct{}{}
|
||||
+ <-waitUpdateAllocatedDevicesChan
|
||||
+ return &pluginapi.PreferredAllocationResponse{
|
||||
+ ContainerResponses: []*pluginapi.ContainerPreferredAllocationResponse{
|
||||
+ {
|
||||
+ DeviceIDs: []string{deviceID},
|
||||
+ },
|
||||
+ },
|
||||
+ }, nil
|
||||
+ })
|
||||
+
|
||||
+ testManager := &ManagerImpl{
|
||||
+ endpoints: make(map[string]endpointInfo),
|
||||
+ healthyDevices: make(map[string]sets.String),
|
||||
+ unhealthyDevices: make(map[string]sets.String),
|
||||
+ allocatedDevices: make(map[string]sets.String),
|
||||
+ podDevices: newPodDevices(),
|
||||
+ activePods: func() []*v1.Pod { return []*v1.Pod{} },
|
||||
+ sourcesReady: &sourcesReadyStub{},
|
||||
+ topologyAffinityStore: topologymanager.NewFakeManager(),
|
||||
+ }
|
||||
+
|
||||
+ testManager.endpoints[resourceName] = endpointInfo{
|
||||
+ e: e,
|
||||
+ opts: &pluginapi.DevicePluginOptions{
|
||||
+ GetPreferredAllocationAvailable: true,
|
||||
+ },
|
||||
+ }
|
||||
+ testManager.healthyDevices[resourceName] = sets.NewString(deviceID)
|
||||
+ testManager.podDevices.insert(podToRemove, containerToRemove, resourceName, nil, nil)
|
||||
+
|
||||
+ go func() {
|
||||
+ <-waitSetGetPreferredAllocChan
|
||||
+ testManager.UpdateAllocatedDevices()
|
||||
+ waitUpdateAllocatedDevicesChan <- struct{}{}
|
||||
+ }()
|
||||
+
|
||||
+ set, err := testManager.devicesToAllocate(podToAllocate, containerToAllocate, resourceName, 1, sets.NewString())
|
||||
+ assert.NoError(t, err)
|
||||
+ assert.Equal(t, set, sets.NewString(deviceID))
|
||||
+}
|
||||
--
|
||||
2.33.0
|
||||
|
||||
@ -3,7 +3,7 @@
|
||||
|
||||
Name: kubernetes
|
||||
Version: 1.20.2
|
||||
Release: 21
|
||||
Release: 22
|
||||
Summary: Container cluster management
|
||||
License: ASL 2.0
|
||||
URL: https://k8s.io/kubernetes
|
||||
@ -40,6 +40,7 @@ Patch6012: 0013-Validate-etcd-paths.patch
|
||||
Patch6013: 0014-fix-node-address-validation.patch
|
||||
Patch6014: 0015-Add-ephemeralcontainer-to-imagepolicy-securityaccoun.patch
|
||||
Patch6015: 0016-Add-envFrom-to-serviceaccount-admission-plugin.patch
|
||||
Patch6016: 0017-backport-Fix-kubelet-panic-when-allocate-resource-for-pod.patch
|
||||
|
||||
%description
|
||||
Container cluster management.
|
||||
@ -271,6 +272,12 @@ getent passwd kube >/dev/null || useradd -r -g kube -d / -s /sbin/nologin \
|
||||
%systemd_postun kubelet kube-proxy
|
||||
|
||||
%changelog
|
||||
* Thu Aug 22 2024 zhaoxiaohu <zhaoxiaohu@kuaishou.com> - 1.20.2-22
|
||||
- Type:bugfix
|
||||
- CVE:NA
|
||||
- SUG:NA
|
||||
- DESC:fix kubelet panic when allocating resources for a pod. #119561
|
||||
|
||||
* Mon Apr 29 2024 liuxu <liuxu156@huawei.com> - 1.20.2-21
|
||||
- Type:bugfix
|
||||
- CVE:NA
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user