From a6632b7e1474c35fc05d99ec4c05e485048543cd Mon Sep 17 00:00:00 2001 From: ysh Date: Wed, 16 Jul 2025 21:55:19 +0800 Subject: [PATCH] =?UTF-8?q?gpu=E5=8A=9F=E8=83=BD=E5=88=9D=E5=A7=8B?= =?UTF-8?q?=E5=8C=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- app/k8sManager/multiple_clusters.py | 4 ++ app/k8sManager/ssh_utils.py | 4 +- ...bnvidia-container-tools_1.17.8-1_amd64.deb | Bin .../libnvidia-container1_1.17.8-1_amd64.deb | Bin ...-container-toolkit-base_1.17.8-1_amd64.deb | Bin ...vidia-container-toolkit_1.17.8-1_amd64.deb | Bin files/nvidia-device-plugin.yml | 49 ++++++++++++++++++ script/k8s_install.sh | 42 +++++++++++---- 8 files changed, 86 insertions(+), 13 deletions(-) rename {script => files}/libnvidia-container-tools_1.17.8-1_amd64.deb (100%) rename {script => files}/libnvidia-container1_1.17.8-1_amd64.deb (100%) rename {script => files}/nvidia-container-toolkit-base_1.17.8-1_amd64.deb (100%) rename {script => files}/nvidia-container-toolkit_1.17.8-1_amd64.deb (100%) create mode 100644 files/nvidia-device-plugin.yml diff --git a/app/k8sManager/multiple_clusters.py b/app/k8sManager/multiple_clusters.py index 5bbbe1a..37cd431 100644 --- a/app/k8sManager/multiple_clusters.py +++ b/app/k8sManager/multiple_clusters.py @@ -207,6 +207,10 @@ def new_cluster_install(params): # "files/nfs-provisioner-deploy.yaml":"/opt/nfs-provisioner-deploy.yaml", "files/nfs-rbac.yaml": "/opt/nfs-rbac.yaml", "files/nvidia-device-plugin.yml": "/opt/nvidia-device-plugin.yml", + "files/libnvidia-container-tools_1.17.8-1_amd64.deb": "/opt/libnvidia-container-tools_1.17.8-1_amd64.deb", + "files/libnvidia-container1_1.17.8-1_amd64.deb": "/opt/libnvidia-container1_1.17.8-1_amd64.deb", + "files/nvidia-container-toolkit_1.17.8-1_amd64.deb": "/opt/nvidia-container-toolkit_1.17.8-1_amd64.deb", + "files/nvidia-container-toolkit-base_1.17.8-1_amd64.deb": "/opt/nvidia-container-toolkit-base_1.17.8-1_amd64.deb", "script/k8s_uninstall.sh": "/opt/k8s_uninstall.sh", "script/import_images.sh": "/opt/import_images.sh", } diff --git a/app/k8sManager/ssh_utils.py b/app/k8sManager/ssh_utils.py index 510d03c..dccd6ec 100644 --- a/app/k8sManager/ssh_utils.py +++ b/app/k8sManager/ssh_utils.py @@ -186,7 +186,7 @@ def execute_sudo_command(ssh, command, password, real_time_log, sudo_timeout, us if time.time() - start_time > sudo_timeout: raise Exception(f"等待sudo密码提示超时({sudo_timeout}秒): {sudo_cmd}") if not ready: - time.sleep(1.5) # 避免CPU占用过高 + time.sleep(0.5) # 避免CPU占用过高 # 如果没有收到密码提示但命令执行超时,可能是权限问题 if not password_prompt: @@ -218,7 +218,7 @@ def execute_sudo_command(ssh, command, password, real_time_log, sudo_timeout, us error += line if channel.exit_status_ready(): break - time.sleep(1.5) # 避免CPU占用过高 + time.sleep(0.1) # 避免CPU占用过高 else: # 非实时模式读取输出 result += channel.recv(-1).decode(errors="replace") if channel.recv_ready() else "" diff --git a/script/libnvidia-container-tools_1.17.8-1_amd64.deb b/files/libnvidia-container-tools_1.17.8-1_amd64.deb similarity index 100% rename from script/libnvidia-container-tools_1.17.8-1_amd64.deb rename to files/libnvidia-container-tools_1.17.8-1_amd64.deb diff --git a/script/libnvidia-container1_1.17.8-1_amd64.deb b/files/libnvidia-container1_1.17.8-1_amd64.deb similarity index 100% rename from script/libnvidia-container1_1.17.8-1_amd64.deb rename to files/libnvidia-container1_1.17.8-1_amd64.deb diff --git a/script/nvidia-container-toolkit-base_1.17.8-1_amd64.deb b/files/nvidia-container-toolkit-base_1.17.8-1_amd64.deb similarity index 100% rename from script/nvidia-container-toolkit-base_1.17.8-1_amd64.deb rename to files/nvidia-container-toolkit-base_1.17.8-1_amd64.deb diff --git a/script/nvidia-container-toolkit_1.17.8-1_amd64.deb b/files/nvidia-container-toolkit_1.17.8-1_amd64.deb similarity index 100% rename from script/nvidia-container-toolkit_1.17.8-1_amd64.deb rename to files/nvidia-container-toolkit_1.17.8-1_amd64.deb diff --git a/files/nvidia-device-plugin.yml b/files/nvidia-device-plugin.yml new file mode 100644 index 0000000..eee27cd --- /dev/null +++ b/files/nvidia-device-plugin.yml @@ -0,0 +1,49 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: nvidia-device-plugin-daemonset + namespace: kube-system +spec: + selector: + matchLabels: + name: nvidia-device-plugin-ds + updateStrategy: + type: RollingUpdate + template: + metadata: + labels: + name: nvidia-device-plugin-ds + spec: + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + priorityClassName: "system-node-critical" + containers: + - image: nvcr.io/nvidia/k8s-device-plugin:v0.13.0 + name: nvidia-device-plugin-ctr + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + volumeMounts: + - name: device-plugin + mountPath: /var/lib/kubelet/device-plugins + volumes: + - name: device-plugin + hostPath: + path: /var/lib/kubelet/device-plugins \ No newline at end of file diff --git a/script/k8s_install.sh b/script/k8s_install.sh index 3164b91..a0ad0a3 100644 --- a/script/k8s_install.sh +++ b/script/k8s_install.sh @@ -309,16 +309,30 @@ echo "containerd配置初始纠正完成." echo "开始更新containerd配置以适配GPU实例" -# 检查是否有 NVIDIA GPU +# 检查 NVIDIA GPU if lspci | grep -i nvidia > /dev/null 2>&1; then - log_info "检测到NVIDIA GPU,开始配置nvidia-container-runtime..." + if [ "$1" == "worker" ]; then + log_info "检测到NVIDIA GPU,开始配置nvidia-container-runtime..." - dpkg -i /opt/*.deb || log_error "安装nvidia-container-runtime及其依赖失败!" + # 检查 .deb 文件是否存在 + if [ ! "$(ls /opt/*.deb 2>/dev/null | wc -l)" -ge 1 ]; then + log_error "/opt/ 下没有 .deb 文件" + exit 1 + fi - # 配置 containerd 支持 nvidia runtime - CONTAINERD_CONFIG="/etc/containerd/config.toml" - if ! grep -q '\[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia\]' "$CONTAINERD_CONFIG"; then - cat <> "$CONTAINERD_CONFIG" + # 安装 .deb 包 + for deb in /opt/*.deb; do + dpkg -i "$deb" || log_error "安装 $deb 失败" + done + + # 配置 containerd + CONTAINERD_CONFIG="/etc/containerd/config.toml" + if ! grep -q ' +$$ +plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia +$$ +' "$CONTAINERD_CONFIG"; then + cat <> "$CONTAINERD_CONFIG" [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia] privileged_without_host_devices = false @@ -326,11 +340,17 @@ if lspci | grep -i nvidia > /dev/null 2>&1; then [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia.options] BinaryName = "/usr/bin/nvidia-container-runtime" EOF - fi + fi - # 重启 containerd - systemctl restart containerd - log_info "nvidia-container-runtime 配置完成,containerd已重启" + # 设置 default_runtime_name = "nvidia" + if ! grep -q '^default_runtime_name = "nvidia"$' "$CONTAINERD_CONFIG"; then + sed -i '/$$plugins."io.containerd.grpc.v1.cri"$$/{n;s/.*/ default_runtime_name = "nvidia"/;}' "$CONTAINERD_CONFIG" + fi + + # 重启 containerd + systemctl restart containerd + log_info "nvidia-container-runtime 配置完成,containerd已重启" + fi else log_info "未检测到NVIDIA GPU,跳过nvidia-container-runtime配置" fi -- 2.34.1