解决多张gpu卡情况下测试失败问题

This commit is contained in:
gaochuanji 2024-08-30 16:37:22 +08:00
parent 7b80b7d64f
commit b8c3356c93
2 changed files with 43 additions and 1 deletions

View File

@ -0,0 +1,36 @@
diff -Naur rpm/tests/compatible/gpu/nvidia_gpu.py oech/tests/compatible/gpu/nvidia_gpu.py
--- rpm/tests/compatible/gpu/nvidia_gpu.py 2024-08-29 19:48:19.472522063 +0800
+++ oech/tests/compatible/gpu/nvidia_gpu.py 2024-08-30 16:21:51.821261220 +0800
@@ -73,6 +73,11 @@
self.logger.info("Set default test gpu as %s." % id_num)
+ def clean_default_gpu(self):
+ if 'CUDA_VISIBLE_DEVICES' in os.environ:
+ del os.environ['CUDA_VISIBLE_DEVICES']
+ self.logger.info("Clean default test gpu.")
+
def test_pressure(self):
"""
Set pressure for gpu to test
@@ -87,10 +92,7 @@
os.chdir("/opt/gpu-burn")
cmd = self.command.run_cmd(
- "nvidia-smi -q | grep -i -A1 '%s' | grep 'Product Name' | cut -d ':' -f 2" % pci_num)
- device_name = cmd[0].strip()
- cmd = self.command.run_cmd(
- "./gpu_burn -l | grep -i '%s' | cut -d ':' -f 1 | awk '{print $2}'" % device_name)
+ "nvidia-smi -q | grep -i -A20 '%s' | grep 'Minor Number' | cut -d ':' -f 2" % pci_num)
run_id = cmd[0].strip()
cmd = getstatusoutput(
'nohup ./gpu_burn -i%s 10 &> %s &' % (run_id, self.gpu_burn))
@@ -185,6 +187,8 @@
result = False
self.logger.error("Test Vulkan failed.")
+ self.clean_default_gpu()
+
except Exception as e:
self.logger.error(
"Failed to run the script because compiling or setting variables: %s" % e)

View File

@ -6,7 +6,7 @@
Name: oec-hardware
Summary: openEuler Hardware Compatibility Test Suite
Version: 1.1.5
Release: 5
Release: 6
Group: Development/Tools
License: Mulan PSL v2
URL: https://gitee.com/openeuler/oec-hardware
@ -18,6 +18,7 @@ Patch0002: oec-hardware-1.1.5-2-fix-bug.patch
Patch0003: oec-hardware-1.1.5-3-functional-optimization-fix-bug.patch
Patch0004: oec-hardware-1.1.5-4-fix-gpu-testcase-bug.patch
Patch0005: oec-hardware-1.1.5-5-fix-cpufreq-userspace.patch
Patch0006: oec-hardware-1.1.5-6-fix-bug-gpu.patch
Buildroot: %{_tmppath}/%{name}-%{version}-root
BuildRequires: gcc
@ -46,6 +47,7 @@ openEuler Hardware Compatibility Test Server
%patch3 -p1
%patch4 -p1
%patch5 -p1
%patch6 -p1
%build
%ifarch x86_64 aarch64
@ -93,6 +95,10 @@ sed -i 's#grep openeulerversion /etc/openEuler-latest#grep %{vendor_lowercase}ve
rm -rf /var/lock/oech.lock
%changelog
* Fri Aug 30 2024 gaochuanji <gaochuanji@inspur.com> - 1.1.5-6
- Resolve the gpu_burn test failure on systems with multiple GPUs
- Resolve the issue where the first GPU test succeeds but the others fail on systems with multiple GPUs
* Tue Aug 13 2024 gaochuanji <gaochuanji@inspur.com> - 1.1.5-5
- Resolve the issue of failed userspace testing for cpufreq on certain models